def parseRaw(json_map):
    """Extract the URL and raw HTML from one decoded JSON record.

    Args:
        json_map: Mapping that contains the keys ``'url'`` and ``'html'``.

    Returns:
        A ``(url, html)`` tuple.

    Raises:
        KeyError: If either key is missing.
    """
    return (json_map['url'], json_map['html'])
## getContent: for an input article, get its own word list via jieba.cut()
def getContent(x):
    """Tokenize the text of an HTML document into Chinese words.

    Args:
        x: HTML markup of one article.

    Returns:
        List of the segments yielded by ``jieba.cut`` that are longer than
        one character and accepted by ``checkword``, in the order produced
        (duplicates are kept).
    """
    # Third-party imports stay function-local.
    from bs4 import BeautifulSoup
    import jieba

    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so the extracted text can
    # differ from one machine to the next.
    soup = BeautifulSoup(x, 'html.parser')
    text = soup.get_text()
    # Strip newlines, carriage returns, spaces and tabs before segmentation.
    for ch in ('\n', '\r', ' ', '\t'):
        text = text.replace(ch, '')
    return [term for term in jieba.cut(text)
            if len(term) > 1 and checkword(term)]
def checkword(x):
    """Tell whether ``x`` consists solely of CJK Unified Ideographs.

    Args:
        x: String to test.

    Returns:
        ``True`` when ``x`` is non-empty and every character lies in the
        range U+4E00..U+9FFF, otherwise ``False``.
    """
    # bool(x) first: all() over an empty string is vacuously True, which
    # would classify the empty string as a valid word.
    return bool(x) and all(u'\u4e00' <= c <= u'\u9fff' for c in x)
import json
# Load both corpora: each line is decoded as JSON and reduced to (url, html).
# NOTE(review): `sc` is not defined in this file; presumably the SparkContext
# supplied by the notebook/shell - confirm.
travel_content = sc.textFile("./pixnet.txt").map(json.loads).map(parseRaw)
makeup_content = sc.textFile("./makeup.txt").map(json.loads).map(parseRaw)

# List the article URLs of each corpus.
# NOTE(review): the receiver and `.map(lambda` were missing from the next four
# statements in the source; they are restored from context - confirm.
print(travel_content.map(lambda x: x[0]).collect())
print(makeup_content.map(lambda x: x[0]).collect())

# Replace each article's HTML with its token list: (url, [token, ...]).
travel_token = travel_content.map(lambda x: (x[0], getContent(x[1])))
makeup_token = makeup_content.map(lambda x: (x[0], getContent(x[1])))
def countTokens(tokenRDD):
    """Return the total number of tokens over all records of ``tokenRDD``.

    Args:
        tokenRDD: RDD of ``(key, tokens)`` pairs; only ``len(tokens)`` is used.

    Returns:
        Sum of the token-list lengths.
    """
    # sum() folds from 0, so an RDD without records yields 0 where
    # reduce() would raise ValueError.
    return tokenRDD.map(lambda x: len(x[1])).sum()
# Grand total of tokens over the travel and makeup corpora.
totalTokens = sum(countTokens(corpus) for corpus in (travel_token, makeup_token))
print('There are %s tokens in full datasets' % totalTokens)

# Both tokenized corpora combined into a single RDD.
trainRDD = travel_token.union(makeup_token)
def findBiggestArticle(fullRDD):
    """Return the record whose token list is the longest.

    Args:
        fullRDD: RDD of ``(key, tokens)`` pairs.

    Returns:
        A list holding the one record with the largest ``len(tokens)``.
    """
    # takeOrdered keeps only the best candidate of each partition instead of
    # sorting the whole RDD just to read its first element.
    return fullRDD.takeOrdered(1, key=lambda x: -len(x[1]))
# findBiggestArticle returns a one-element list of (link, tokens).
biggestArticle = findBiggestArticle(trainRDD)
# NOTE(review): the second format argument was cut off in the source; the
# token count is the evident intent - confirm.
print('The biggest article with Link "%s" has the most tokens (%s)' % (biggestArticle[0][0],
                                                                      len(biggestArticle[0][1])))
# TF-IDF 是一種常用於 Data-Mining 的文章權重計算方法,分別衡量一個斷詞在一篇文章及整個文件集的重要程度。
# IDF (inverse document frequency): 當一個詞在越少文章出現,其出現對文章的重要性就越大。
# TF (term frequency): 當一個詞在單篇文章出現的頻率越大,其對文章的重要性也越大。
# 對於每一篇文章,將其內部所有 token 的 TF、IDF 值求出並相乘,將會產生這篇文章的 weighted vector。最終,可以透過計算兩篇文章的 cosine similarity,判斷這兩篇文章的相似度。
def tf(tokens):
    """Compute the term frequency of every distinct token.

    Args:
        tokens: Sequence of tokens for one document.

    Returns:
        Dict mapping each distinct token to ``count / len(tokens)``, so the
        values sum to 1. An empty ``tokens`` gives an empty dict.
    """
    from collections import Counter

    total = len(tokens)
    # Every occurrence, including the first, must count exactly once;
    # otherwise each frequency is inflated by 1 / len(tokens).
    return {word: float(count) / total
            for word, count in Counter(tokens).items()}
# Term-frequency dict for every travel article.
# NOTE(review): `travel_token.map(lambda` was missing from the source line;
# it is restored from context - confirm.
travel_token_TF = travel_token.map(lambda record: tf(record[1]))

# Tokens of the first article, ordered by descending frequency.
example_dict = travel_token_TF.take(1)[0]
example_dict_sorted = sorted(example_dict, key=example_dict.get, reverse=True)
print("Show 10 tokens with the higest frequency.")
# Slice instead of indexing: prints the 10 tokens the message announces and
# does not raise when the article has fewer distinct tokens.
for token in example_dict_sorted[:10]:
    print(token, example_dict[token])
def idfs(RDD):
    """Compute an inverse-document-frequency weight for every token.

    The weight is the plain ratio ``N / df``; no logarithm is applied.

    Args:
        RDD: RDD of ``(key, tokens)`` pairs, one record per document.

    Returns:
        RDD of ``(token, float(N) / df)`` pairs, where ``N`` is the number of
        records in ``RDD`` and ``df`` the number of records whose token list
        contains the token.
    """
    N = RDD.count()
    # set(): a token counts once per document, however often it occurs there.
    docFrequencies = (RDD
                      .flatMap(lambda x: set(x[1]))
                      .map(lambda token: (token, 1))
                      .reduceByKey(lambda a, b: a + b))
    return docFrequencies.map(lambda x: (x[0], float(N) / x[1]))
# cache(): idfsTrain feeds four separate actions below, and would otherwise
# be recomputed from trainRDD for each of them.
idfsTrain = idfs(trainRDD).cache()
idfsTrainWeights = idfsTrain.collectAsMap()
uniqueTokenCount = idfsTrain.count()
print('There are %s unique tokens in the training datasets.' % uniqueTokenCount)

# Up to 20 tokens whose weight N/df equals 1, i.e. tokens found in every document.
IDFTokens = idfsTrain.filter(lambda token: token[1] == 1).take(20)  ## takeOrdered(10, lambda s: -s[1])
for token in IDFTokens:
    print(token[0] + " " + str(token[1]))

# Up to 20 tokens whose weight N/df equals 10, i.e. found in a tenth of the documents.
IDFTokens = idfsTrain.filter(lambda token: token[1] == 10).take(20)
for token in IDFTokens:
    print(token[0] + " " + str(token[1]))
def tfidf(tokens, idfs):
    """Weight each token's term frequency by its IDF.

    Args:
        tokens: Sequence of tokens for one document.
        idfs: Dict mapping token -> IDF weight; it must have an entry for
            every token in ``tokens``.

    Returns:
        Dict mapping each distinct token to ``tf * idf``.

    Raises:
        KeyError: If a token has no entry in ``idfs``.
    """
    return {tk: freq * idfs[tk] for tk, freq in tf(tokens).items()}
def showTopWord(link):
    """Print the nine highest TF-IDF tokens of the article stored under ``link``.

    Reads the module-level ``trainRDD`` and ``idfsTrainWeights``.

    Args:
        link: Key of the article in ``trainRDD``.

    Raises:
        IndexError: If no record in ``trainRDD`` has this key.
    """
    # take(1) fetches only the first match instead of collecting all of them.
    matches = trainRDD.filter(lambda x: x[0] == link).take(1)
    if not matches:
        raise IndexError('no article with link %r in trainRDD' % (link,))
    tokens = matches[0][1]
    tokens_weights = tfidf(tokens, idfsTrainWeights)
    tokens_weights_sorted = sorted(tokens_weights, key=tokens_weights.get, reverse=True)
    # Slicing prints at most nine entries and tolerates articles with fewer
    # distinct tokens, where indexing 0..8 would raise.
    for token in tokens_weights_sorted[:9]:
        print(token, tokens_weights[token])
# NOTE(review): `link` is an empty string here; presumably the article URL was
# removed from the source - TODO restore it before use.
link = u''
import math
def dotprod(a, b):
    """Return the dot product of two sparse vectors stored as dicts.

    Args:
        a: Dict mapping key -> numeric weight.
        b: Dict mapping key -> numeric weight.

    Returns:
        Sum of ``a[k] * b[k]`` over the keys present in both dicts; ``0``
        when they share no key.
    """
    return sum(a[tk] * b[tk] for tk in a if tk in b)
def norm(a):
    """Return the Euclidean length of a sparse vector stored as a dict.

    Args:
        a: Dict mapping key -> numeric weight.

    Returns:
        Square root of the sum of squared weights; ``0.0`` for an empty dict.
    """
    return math.sqrt(sum(weight * weight for weight in a.values()))
def cossim(a, b):
    """Return the cosine similarity of two sparse vectors stored as dicts.

    Args:
        a: Dict mapping key -> numeric weight.
        b: Dict mapping key -> numeric weight.

    Returns:
        ``dotprod(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either
        vector has zero length.
    """
    denominator = norm(a) * norm(b)
    # An empty or all-zero vector has no direction; report no similarity
    # instead of raising ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return dotprod(a, b) / denominator
def cosineSimilarity(string1, string2, idfsDictionary):
    """Return the cosine similarity of two documents' TF-IDF vectors.

    Args:
        string1: Token sequence of the first document (handed to ``tfidf``).
        string2: Token sequence of the second document.
        idfsDictionary: Dict mapping token -> IDF weight, passed to ``tfidf``
            for both documents.

    Returns:
        Result of ``cossim`` on the two TF-IDF weight dicts.
    """
    return cossim(tfidf(string1, idfsDictionary), tfidf(string2, idfsDictionary))
# Every ordered pair of articles: ((link1, tokens1), (link2, tokens2)).
# NOTE(review): the source line ended at `(trainRDD`; `.cartesian(trainRDD)` is
# inferred from how `record` is indexed below - confirm, and restore any
# further chained call that was lost.
crossPair = (trainRDD
             .cartesian(trainRDD))

# (link1, link2, cosine similarity) for every pair.
# cache(): getSimilar filters this RDD anew on every call.
# NOTE(review): the closing of this expression was missing in the source;
# `.cache()` is an addition - confirm.
similarities = (crossPair
                .map(lambda record:
                     (record[0][0], record[1][0],
                      cosineSimilarity(record[0][1], record[1][1], idfsTrainWeights)))
                .cache())
def getSimilar(link):
    """Return the articles paired with ``link``, most similar first.

    Reads the module-level ``similarities`` RDD of
    ``(link1, link2, similarity)`` records.

    Args:
        link: Value matched against the first element of each record.

    Returns:
        List of ``(other_link, similarity)`` tuples sorted by descending
        similarity; empty when no record matches.
    """
    return (similarities
            .filter(lambda record: record[0] == link)
            .map(lambda record: (record[1], record[2]))
            .sortBy(lambda x: -x[1])
            .collect())
# NOTE(review): both lookups pass an empty link and both loops have no body in
# this view of the file - presumably the URLs and the statements that print
# similarArticle[index] were lost; TODO restore.
similarArticle = getSimilar(u'')
# range(1, 4) visits indices 1..3 and skips index 0 of the sorted result.
for index in range(1, 4):
similarArticle = getSimilar(u'')
for index in range(1, 4):
